#Load necessary libraries and templates

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(broom)
## Warning: package 'broom' was built under R version 4.0.5
library(dplyr)

theme_template <- theme(legend.position="right", plot.title = element_text(size=11)) +
  theme(axis.text.x= element_text(size =10)) + 
  theme(axis.text.y= element_text(size =15)) + 
  theme(axis.title = element_text(size = 15,face="bold")) +
  theme(plot.title = element_text(size = 14, hjust = 0.5)) +
  theme(axis.line = element_line(colour = "black", size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.key=element_blank(),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)) 



#Read in files
covid_data_cap <- read.csv("/Users/lorenzocapitani/OneDrive\ -\ Cardiff\ University/Bioinformatics/MartinPaper/SC2_Key\ Data_Lorenzo_Cap.csv")
covid_data_vein <- read.csv("/Users/lorenzocapitani/OneDrive\ -\ Cardiff\ University/Bioinformatics/MartinPaper/SC2_Key\ Data_Lorenzo_Vein.csv")

names(covid_data_cap)
##  [1] "Participant.ID"            "COVID.19.Positive."       
##  [3] "IFNg"                      "RBD.IgG"                  
##  [5] "N.IgG"                     "Vaccinated."              
##  [7] "Prior.COVID.19."           "Gender"                   
##  [9] "Significant.co.morbidity." "Age"                      
## [11] "Ethnicity"
#Clean dataset: convert binary factors to 1 or 0 
covid_data_cap$COVID.19.Positive. <- ifelse(test=covid_data_cap$COVID.19.Positive. == "Y", yes=1, no=0)
covid_data_cap$Vaccinated. <- ifelse(test=covid_data_cap$Vaccinated. == "Y", yes=1, no=0)
covid_data_cap$Prior.COVID.19. <- ifelse(test=covid_data_cap$Prior.COVID.19. == "Y", yes=1, no=0)
covid_data_cap[covid_data_cap$Gender == "male",]$Gender <- 0
covid_data_cap[covid_data_cap$Gender == "female",]$Gender <- 1
covid_data_cap$Gender <- as.integer(covid_data_cap$Gender)
covid_data_cap$Significant.co.morbidity. <- ifelse(test=covid_data_cap$Significant.co.morbidity. == "Y", yes=1, no=0)

#Repeat for venous dataset
covid_data_vein$COVID.19.Positive. <- ifelse(test=covid_data_vein$COVID.19.Positive. == "Y", yes=1, no=0)
covid_data_vein$Vaccinated. <- ifelse(test=covid_data_vein$Vaccinated. == "Y", yes=1, no=0)
covid_data_vein$Prior.COVID.19. <- ifelse(test=covid_data_vein$Prior.COVID.19. == "Y", yes=1, no=0)
covid_data_vein[covid_data_vein$Gender == "M",]$Gender <- 0
covid_data_vein[covid_data_vein$Gender == "F",]$Gender <- 1
covid_data_vein$Gender <- as.integer(covid_data_vein$Gender)

#A visual take on the missing values 
library(Amelia)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.0.5
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(covid_data_vein, main = "Missing values vs observed in vein dataset")

missmap(covid_data_cap, main = "Missing values vs observed in cap dataset")

# Mean interpolation for the few missing values present
covid_data_vein$Age[is.na(covid_data_vein$Age)] <- mean(covid_data_vein$Age,na.rm=T)
covid_data_vein$RBD.IgG[is.na(covid_data_vein$RBD.IgG)] <- mean(covid_data_vein$RBD.IgG,na.rm=T)

covid_data_cap$Age[is.na(covid_data_cap$Age)] <- mean(covid_data_cap$Age,na.rm=T)
covid_data_cap$IFNg[is.na(covid_data_cap$IFNg)] <- mean(covid_data_cap$IFNg,na.rm=T)


missmap(covid_data_vein, main = "Missing values vs observed in vein dataset")

missmap(covid_data_cap, main = "Missing values vs observed in cap dataset")

#Let's have a look at the data - not used in publication, solyely exploratory


covid_data_cap <- covid_data_cap %>% add_column("Origin" = "Capillary")
covid_data_vein <- covid_data_vein %>% add_column("Origin" = "Venous")

numeric_covid_merged <- rbind(covid_data_cap[,c(2,3,4,6,7,8,10,12)],  covid_data_vein[,c(2,3,4,5,6,7,8,9)])

#Age
ggplot(numeric_covid_merged, aes(x=Age, color=Origin)) +
  geom_histogram(alpha=0.5, position = "dodge") +
  geom_vline(data=numeric_covid_merged[numeric_covid_merged$Origin == "Capillary",],aes(xintercept=mean(Age), color=Origin),
             linetype="dashed") + 
  geom_vline(data=numeric_covid_merged[numeric_covid_merged$Origin == "Venous",],aes(xintercept=mean(Age), color=Origin),
             linetype="dashed") + theme_template
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Gender
table1 = table(numeric_covid_merged$Gender, numeric_covid_merged$Origin)
rownames(table1) <- c("Male", "Female")

mosaicplot(table1, main = "Mosaic Plot of Gender in the two datasets")

#Age table to look at infections
numeric_covid_merged$Age <- ifelse(numeric_covid_merged$Age > 50, "Over 50", "Under 50")
table1 = table(numeric_covid_merged$Age, numeric_covid_merged$COVID.19.Positive., numeric_covid_merged$Origin)
colnames(table1) <- c("Not infection", "Infected")

mosaicplot(table1, main = "Mosaic Plot of Age vs COVID infection")

#Gender
table1 = table(numeric_covid_merged$Gender, numeric_covid_merged$COVID.19.Positive., numeric_covid_merged$Origin)
rownames(table1) <- c("Male", "Female")
colnames(table1) <- c("Not infection", "Infected")
mosaicplot(table1, main = "Mosaic Plot of Gender in the two datasets")

#Vaccination status
table1 = table(numeric_covid_merged$Vaccinated., numeric_covid_merged$Origin)
rownames(table1) <- c("No", "Yes")

mosaicplot(table1, main = "Vaccination status")

#Prior infection  status
table1 = table(numeric_covid_merged$Prior.COVID.19., numeric_covid_merged$Origin)
rownames(table1) <- c("No", "Yes")

mosaicplot(table1, main = "Prior COVID infection status")

#Preliminary univariate analysis of variables from venous dataset - not used in publication

library(ggplot2)
library(ggbeeswarm)
library(ggpubr)

numeric_covid_vein <- covid_data_vein[,c(2,3,4,5,6,7,8)]
numeric_covid_vein$COVID.19.Positive. <- factor(numeric_covid_vein$COVID.19.Positive.)
numeric_covid_vein$Vaccinated. <- factor(numeric_covid_vein$Vaccinated.)
numeric_covid_vein$Prior.COVID.19. <- factor(numeric_covid_vein$Prior.COVID.19.)
numeric_covid_vein$Gender <- factor(numeric_covid_vein$Gender)


ggplot(numeric_covid_vein, aes(x = COVID.19.Positive., y = IFNg)) +
  geom_violin() +
  geom_quasirandom(width = 0.2) +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("IFNg (pg/ml)") +
  theme_template + stat_compare_means(label.x.npc = 0.4)

ggplot(numeric_covid_vein, aes(x = COVID.19.Positive., y = RBD.IgG)) +
  geom_violin() +
  geom_quasirandom(width = 0.2) +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Ant-RBD IgG titres") +
  theme_template +
  stat_compare_means(label.x.npc = 0.4)

ggplot(numeric_covid_vein, aes(x = COVID.19.Positive., fill = Vaccinated.)) +
  geom_bar(stat ='count',position = "dodge2") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Count") +
  theme_template +
  labs(fill = "Vaccinated") +
  scale_fill_manual(labels = c("No", "Yes"), values = c("brown3", "darkgoldenrod1")) 

ggplot(numeric_covid_vein, aes(x = COVID.19.Positive., fill = Prior.COVID.19.)) +
  geom_bar(stat ='count',position = "dodge2") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Count") +
  theme_template +
  labs(fill = "Prior COVID infection") +
  scale_fill_manual(labels = c("No", "Yes"), values = c("brown3", "darkgoldenrod1")) 

ggplot(numeric_covid_vein, aes(x = COVID.19.Positive., fill = Gender)) +
  geom_bar(stat ='count',position = "dodge2") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Count") +
  theme_template +
  labs(fill = "Gender") +
  scale_fill_manual(labels = c("Male", "Female"), values = c("brown3", "darkgoldenrod1")) 

ggplot(numeric_covid_vein, aes(x = COVID.19.Positive., y = Age)) +
    geom_violin() +
  geom_quasirandom(width = 0.2) +
  xlab("Breakthrough COVID19 infection") +
  ylab("Age") +
  theme_template +
    scale_x_discrete(labels = c("No", "Yes"))+
  stat_compare_means(label.x.npc = 0.4)

Univariate analysis of capillary dataset

#Preliminary univariate analysis of variables from capillary dataset - not used in publication


numeric_covid_cap <- covid_data_cap[,c(2,3,4,5,6,7,8,9,10)]
numeric_covid_cap$COVID.19.Positive. <- factor(numeric_covid_cap $COVID.19.Positive.)
numeric_covid_cap$Vaccinated. <- factor(numeric_covid_cap $Vaccinated.)
numeric_covid_cap$Prior.COVID.19. <- factor(numeric_covid_cap $Prior.COVID.19.)
numeric_covid_cap$Gender <- factor(numeric_covid_cap $Gender)
numeric_covid_cap$Significant.co.morbidity. <- factor(numeric_covid_cap$Significant.co.morbidity.)

ggplot(numeric_covid_cap , aes(x = COVID.19.Positive., y = IFNg)) +
  geom_violin() +
  geom_quasirandom(width = 0.2) +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("IFNg (pg/ml)") +
  theme_template + stat_compare_means(label.x.npc = 0.4)

ggplot(numeric_covid_cap , aes(x = COVID.19.Positive., y = RBD.IgG)) +
  geom_violin() +
  geom_quasirandom(width = 0.2) +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Ant-RBD IgG titres") +
  theme_template +
  stat_compare_means(label.x.npc = 0.4)

ggplot(numeric_covid_cap , aes(x = COVID.19.Positive., fill = Vaccinated.)) +
  geom_bar(stat ='count',position = "dodge2") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Count") +
  theme_template +
  labs(fill = "Vaccinated") +
  scale_fill_manual(labels = c("No", "Yes"), values = c("brown3", "darkgoldenrod1")) 

ggplot(numeric_covid_cap , aes(x = COVID.19.Positive., fill = Prior.COVID.19.)) +
  geom_bar(stat ='count',position = "dodge2") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Count") +
  theme_template +
  labs(fill = "Prior COVID infection") +
  scale_fill_manual(labels = c("No", "Yes"), values = c("brown3", "darkgoldenrod1")) 

ggplot(numeric_covid_cap , aes(x = COVID.19.Positive., fill = Gender)) +
  geom_bar(stat ='count',position = "dodge2") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Count") +
  theme_template +
  labs(fill = "Gender") +
  scale_fill_manual(labels = c("Male", "Female"), values = c("brown3", "darkgoldenrod1")) 

ggplot(numeric_covid_cap , aes(x = COVID.19.Positive., fill = Significant.co.morbidity.)) +
  geom_bar(stat ='count',position = "dodge2") +
  scale_x_discrete(labels = c("No", "Yes")) + 
  xlab("Breakthrough COVID19 infection") +
  ylab("Count") +
  theme_template +
  labs(fill = "Significant comorbidity") +
  scale_fill_manual(labels = c("Male", "Female"), values = c("brown3", "darkgoldenrod1")) 

ggplot(numeric_covid_cap , aes(x = COVID.19.Positive., y = Age)) +
    geom_violin() +
  geom_quasirandom(width = 0.2) +
  xlab("Breakthrough COVID19 infection") +
  ylab("Age") +
  theme_template +
    scale_x_discrete(labels = c("No", "Yes"))+
  stat_compare_means(label.x.npc = 0.4)

options(warn=-1)
# Spearman's Rank Correlation Matrix included in publication
library(corrplot)
## corrplot 0.92 loaded
numeric_covid_cap <- covid_data_cap[,c(2,3,4,5,6,7,8,9,10)]
numeric_covid_vein <- covid_data_vein[,c(2,3,4,5,6,7,8)]
covid_vein_cor_matrix <- cor(numeric_covid_vein, use = "complete.obs", method = "spearman")
covid_cap_cor_matrix <- cor(numeric_covid_cap, use = "complete.obs", method = "spearman")

#Obtain p-value matrix for each comparison and fix for multiple comparisons 
testRes_vein = cor.mtest(numeric_covid_vein, conf.level = 0.95,  method = "spearman", adjust="holm")
testRes_cap = cor.mtest(numeric_covid_cap, conf.level = 0.95,  method = "spearman", adjust="holm")

corrplot(covid_vein_cor_matrix, p.mat = testRes_vein$p, sig.level = 0.05, method = 'square', order="original", tl.cex = 2, tl.col = "black", insig = "blank", type = 'lower') 
title(main = "Vein dataset", cex.main =2, line = 0)

corrplot(covid_cap_cor_matrix, p.mat = testRes_cap$p, sig.level = 0.05, method = 'square', order="original", tl.cex = 2, tl.col = "black", insig = "blank", type = 'lower') 
title(main = "Capillary dataset", cex.main =2, line = 0)

# Binary Logisitic Regression model development
library(ggplot2)
library(OddsPlotty)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(tidyverse)
library(rms)
## Loading required package: Hmisc
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
# Generate testdata for manual scaling
testdata <- numeric_covid_vein

 # Set up null model
model_null <- glm(COVID.19.Positive.~ 1, data=numeric_covid_vein, family=binomial(link='logit'))
 # Set up full model
model_vein <- glm(COVID.19.Positive.~ ., data=numeric_covid_vein, family=binomial(link='logit'))
summary(model_vein)
## 
## Call:
## glm(formula = COVID.19.Positive. ~ ., family = binomial(link = "logit"), 
##     data = numeric_covid_vein)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0769  -0.6280  -0.3800  -0.1281   2.9647  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      0.1885773  1.8968081   0.099   0.9208  
## IFNg            -0.0037624  0.0016641  -2.261   0.0238 *
## RBD.IgG         -0.0008458  0.0006665  -1.269   0.2044  
## Vaccinated.     -0.6651538  1.4871478  -0.447   0.6547  
## Prior.COVID.19. -1.1192991  0.6361027  -1.760   0.0785 .
## Gender           0.3391404  0.6257416   0.542   0.5878  
## Age              0.0044845  0.0222180   0.202   0.8400  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 127.52  on 146  degrees of freedom
## Residual deviance: 106.12  on 140  degrees of freedom
## AIC: 120.12
## 
## Number of Fisher Scoring iterations: 6
#Derive model pvalue and pseudo R^2
model_pval <- anova(model_null, model_vein, test = 'LRT')
rsqrd <- lrm(COVID.19.Positive.~ ., data=testdata)


#Derive odds ratios with 95% CI and plot
odds <- odds_plot(model_vein)
## Waiting for profiling to be done...
odds <- odds$odds_data
odds$OR <- log(odds$OR)
odds$lower <- log(odds$lower)
odds$upper <- log(odds$upper)


# Visualisations of odds ratios was performed in R but the final figures in the publication were generated in GraphPad using the same values 
ggplot(odds, aes(x = OR, y = vars)) + 
    geom_vline(aes(xintercept = 0), size = .25, linetype = "dashed") + 
    geom_errorbarh(aes(xmax = upper, xmin = lower), size = .5, height = 
                    .2, color = "gray50") +
    geom_point(size = 3.5, color = "orange") +
    theme_bw()+
    theme(panel.grid.minor = element_blank()) +
    ylab("") +
    xlab("log(Odds ratio)") +
    annotate(geom = "text", y =1.1, x = log10(13), 
                     label = paste("Model p <", round(model_pval$`Pr(>Chi)`[2], digits = 4)), size = 3.5, hjust = 4) + 
  annotate(geom = "text", y =0.7, x = log10(13), 
                     label = paste("Pseudo R^2 = ", round(rsqrd$stats[10][1], digits = 2)), size = 3.5, hjust = 3.53) + 
    ggtitle("Parameters and the risk of COVID infections")

# Visualize odd_plots for capillary
library(ggplot2)
library(OddsPlotty)
library(caret)
library(tidyverse)
library(rms)



# Generate testdata for manual scaling
testdata <- numeric_covid_cap
testdata$COVID.19.Positive. <- as.factor(testdata$COVID.19.Positive.)
testdata$Vaccinated. <- as.factor(testdata$Vaccinated. )
testdata$Prior.COVID.19. <- as.factor(testdata$Prior.COVID.19. )
testdata$Gender <- as.factor(testdata$Gender)
testdata$Significant.co.morbidity. <- as.factor(testdata$Significant.co.morbidity. )


 # Set up null model
model_null <- glm(COVID.19.Positive.~ 1, data=testdata, family=binomial(link='logit'))
model_cap <- glm(COVID.19.Positive.~ ., data=testdata, family=binomial(link='logit'))
summary(model_cap)
## 
## Call:
## glm(formula = COVID.19.Positive. ~ ., family = binomial(link = "logit"), 
##     data = testdata)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.04081  -0.34264  -0.17642  -0.02441   2.63824  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                 0.3828300  1.6302669   0.235   0.8143  
## IFNg                       -0.0161447  0.0092863  -1.739   0.0821 .
## RBD.IgG                     0.0009258  0.0015421   0.600   0.5483  
## N.IgG                      -0.0088809  0.0132775  -0.669   0.5036  
## Vaccinated.1               -1.2443234  1.5232723  -0.817   0.4140  
## Prior.COVID.19.1            0.0319377  0.8213686   0.039   0.9690  
## Gender1                     0.9732765  0.8596080   1.132   0.2575  
## Significant.co.morbidity.1 -1.2720622  1.1029923  -1.153   0.2488  
## Age                        -0.0449843  0.0258653  -1.739   0.0820 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84.851  on 196  degrees of freedom
## Residual deviance: 65.472  on 188  degrees of freedom
## AIC: 83.472
## 
## Number of Fisher Scoring iterations: 9
model_pval <- anova(model_null, model_cap, test = 'LRT')
rsqrd <- lrm(COVID.19.Positive.~ ., data=testdata)

# Visualisations of odds ratios was performed in R but the final figures in the publication were generated in GraphPad using the same values 
ggplot(odds, aes(x = OR, y = vars)) + 
    geom_vline(aes(xintercept = 0), size = .25, linetype = "dashed") + 
    geom_errorbarh(aes(xmax = upper, xmin = lower), size = .5, height = 
                    .2, color = "gray50") +
    geom_point(size = 3.5, color = "orange") +
    theme_bw()+
    theme(panel.grid.minor = element_blank()) +
    ylab("") +
    xlab("log(Odds ratio)") +
    annotate(geom = "text", y =1.1, x = log10(13), 
                     label = paste("Model p <", round(model_pval$`Pr(>Chi)`[2], digits = 4)), size = 3.5, hjust = 4) + 
  annotate(geom = "text", y =0.7, x = log10(13), 
                     label = paste("Pseudo R^2 = ", round(rsqrd$stats[10][1], digits = 2)), size = 3.5, hjust = 3.53) + 
    ggtitle("Parameters and the risk of COVID infections")

library(grt)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'grt'
## The following object is masked from 'package:base':
## 
##     scale
library(patchwork)
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:MASS':
## 
##     area
#Exploring the impact of IFng and RBD on the predicted probability of infection derived from the logistic regression model
# Exploratory in nature and not included in the publication 

predicted.data_unscaled <- data.frame(
  probability.of.infection =model_vein$fitted.values,
  IFNg=numeric_covid_vein$IFNg,
  RBD = numeric_covid_vein$RBD.IgG,
  PriorInfection = as.factor(numeric_covid_vein$Prior.COVID.19.),
  COVIDPositive = as.factor(numeric_covid_vein$COVID.19.Positive.))


IFNg <- ggplot(data=predicted.data_unscaled, aes(x=IFNg, y=probability.of.infection)) +
  geom_point(aes(colour = PriorInfection), show.legend = F) +
  xlab("IFNg (pg/ml)") +
  ylab("Predicted probability") +
  labs(color='Prior Infection Status') +
  scale_color_manual(labels = c("No", "Yes"), values = c("brown3", "darkgoldenrod1")) +
  #ggtitle("Predicted probability of getting a breakthrough infection \n based on metrics from venous samples") +
   # scale_x_continuous(trans = "log", breaks = c(10,100,1000)) +
  theme(plot.title = element_text(size = 14, face = 'bold'),
    axis.title.x = element_text(size = 12, face = 'bold'),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.y = element_text(size = 12, face = 'bold'),
    legend.title = element_text(size = 12)) + 
  theme(axis.line = element_line(colour = "black", size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.key=element_blank())

RBD <- ggplot(data=predicted.data_unscaled, aes(x=RBD, y=probability.of.infection)) +
  geom_point(aes(colour = PriorInfection)) +
  xlab("Anti-RBD IgG") +
  ylab("Predicted probability") +
  labs(color='Prior Infection Status') +
  scale_color_manual(labels = c("No", "Yes"), values = c("brown3", "darkgoldenrod1")) +
 #ggtitle("Predicted probability of getting a breakthrough infection \n based on metrics from venous samples") +
    #scale_x_continuous(trans = "log", breaks = c(10,100,1000)) +
  theme(plot.title = element_text(size = 14, face = 'bold'),
    axis.title.x = element_text(size = 12, face = 'bold'),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.y = element_text(size = 12, face = 'bold'),
    legend.title = element_text(size = 12)) + 
  theme(axis.line = element_line(colour = "black", size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.key=element_blank())


IFNg + RBD

# Relationship between IFNg and infection probability not linear
# Based on this the analysis was repeated using quartile groups
library(caret)
library(OddsPlotty)
library(ggplot2)
library(OddsPlotty)
library(caret)
library(tidyverse)
library(rms)

#Conversuin of IFNg/RBD into a quartile-group based factors

numeric_covid_vein <- covid_data_vein[,c(2,3,4,5,6,7,8)]


q1 <- summary(numeric_covid_vein$IFNg)[2]
q2 <- summary(numeric_covid_vein$IFNg)[3]
q3 <- summary(numeric_covid_vein$IFNg)[5]

q1r <- summary(numeric_covid_vein$RBD.IgG)[2]
q2r <- summary(numeric_covid_vein$RBD.IgG)[3]
q3r <- summary(numeric_covid_vein$RBD.IgG)[5]

numeric_covid_vein[numeric_covid_vein$IFNg < q1,]$IFNg <- 1
numeric_covid_vein[numeric_covid_vein$IFNg >= q1 & numeric_covid_vein$IFNg < q2,]$IFNg <- 2
numeric_covid_vein[numeric_covid_vein$IFNg >= q2 & numeric_covid_vein$IFNg < q3,]$IFNg <- 3
numeric_covid_vein[numeric_covid_vein$IFNg >= q3,]$IFNg <- 4

numeric_covid_vein[numeric_covid_vein$RBD.IgG < q1r,]$RBD.IgG <- 1
numeric_covid_vein[numeric_covid_vein$RBD.IgG >= q1r & numeric_covid_vein$RBD.IgG < q2r,]$RBD.IgG <- 2
numeric_covid_vein[numeric_covid_vein$RBD.IgG >= q2r & numeric_covid_vein$RBD.IgG < q3r,]$RBD.IgG <- 3
numeric_covid_vein[numeric_covid_vein$RBD.IgG >= q3r,]$RBD.IgG <- 4



numeric_covid_vein$IFNg <- factor(numeric_covid_vein$IFNg, levels = c(1,2,3,4))
numeric_covid_vein$RBD.IgG <- factor(numeric_covid_vein$RBD.IgG, levels = c(1,2,3,4))


#Repeat of glm
model_vein <- glm(COVID.19.Positive.~ ., data=numeric_covid_vein, family=binomial(link='logit'))

model_null <- glm(COVID.19.Positive.~ 1, data=numeric_covid_vein, family=binomial(link='logit'))

model_pval <- anova(model_null, model_vein, test = 'LRT')
rsqrd <- lrm(COVID.19.Positive.~ ., data=numeric_covid_vein)


odds <- odds_plot(model_vein)
## Waiting for profiling to be done...
odds <- odds$odds_data
odds$OR <- log(odds$OR)
odds$lower <- log(odds$lower)
odds$upper <- log(odds$upper)

summary(model_vein)
## 
## Call:
## glm(formula = COVID.19.Positive. ~ ., family = binomial(link = "logit"), 
##     data = numeric_covid_vein)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1810  -0.5863  -0.3209  -0.1369   2.5755  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      5.931e-01  2.040e+00   0.291   0.7712  
## IFNg2           -7.633e-01  5.818e-01  -1.312   0.1895  
## IFNg3           -2.177e+00  8.497e-01  -2.562   0.0104 *
## IFNg4           -2.526e+00  1.106e+00  -2.285   0.0223 *
## RBD.IgG2         3.940e-01  7.221e-01   0.546   0.5853  
## RBD.IgG3        -1.417e+01  1.691e+03  -0.008   0.9933  
## RBD.IgG4        -6.375e-01  6.299e-01  -1.012   0.3114  
## Vaccinated.     -1.142e+00  1.695e+00  -0.673   0.5007  
## Prior.COVID.19. -1.455e+00  6.994e-01  -2.080   0.0375 *
## Gender           3.941e-01  6.510e-01   0.605   0.5450  
## Age              3.324e-03  2.304e-02   0.144   0.8853  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 127.52  on 146  degrees of freedom
## Residual deviance: 100.31  on 136  degrees of freedom
## AIC: 122.31
## 
## Number of Fisher Scoring iterations: 15
ggplot(odds, aes(x = OR, y = vars)) + 
    geom_vline(aes(xintercept = 0), size = .25, linetype = "dashed") + 
    geom_errorbarh(aes(xmax = upper, xmin = lower), size = .5, height = 
                    .2, color = "gray50") +
    geom_point(size = 3.5, color = "orange") +
    theme_bw()+
    theme(panel.grid.minor = element_blank()) +
    ylab("") +
    xlab("log(Odds ratio)") +
    annotate(geom = "text", y =3, x = log10(1), 
                     label = paste("Model p <", round(model_pval$`Pr(>Chi)`[2], digits = 4)), size = 3.5, hjust = 4) + 
  annotate(geom = "text", y =3, x = log10(1), 
                     label = paste("Pseudo R^2 = ", round(rsqrd$stats[10][1], digits = 2)), size = 3.5, hjust = 3.53) + 
    ggtitle("Parameters and the risk of COVID infections") +
  scale_x_continuous(limits = c(-6,6))

# Odd as percentages
(exp(model_vein$coefficients[-1])-1)*100
##           IFNg2           IFNg3           IFNg4        RBD.IgG2        RBD.IgG3 
##     -53.3887035     -88.6598873     -92.0059861      48.2919251     -99.9999296 
##        RBD.IgG4     Vaccinated. Prior.COVID.19.          Gender             Age 
##     -47.1413159     -68.0686938     -76.6543272      48.3007191       0.3329825
# Development of final cross-validated using bestglm

numeric_covid_vein <- covid_data_vein[,c(2,3,4,5,6,7,8)]

q1 <- summary(numeric_covid_vein$IFNg)[2]
q2 <- summary(numeric_covid_vein$IFNg)[3]
q3 <- summary(numeric_covid_vein$IFNg)[5]

q1r <- summary(numeric_covid_vein$RBD.IgG)[2]
q2r <- summary(numeric_covid_vein$RBD.IgG)[3]
q3r <- summary(numeric_covid_vein$RBD.IgG)[5]

numeric_covid_vein[numeric_covid_vein$IFNg < q1,]$IFNg <- 1
numeric_covid_vein[numeric_covid_vein$IFNg >= q1 & numeric_covid_vein$IFNg < q2,]$IFNg <- 2
numeric_covid_vein[numeric_covid_vein$IFNg >= q2 & numeric_covid_vein$IFNg < q3,]$IFNg <- 3
numeric_covid_vein[numeric_covid_vein$IFNg >= q3,]$IFNg <- 4

numeric_covid_vein[numeric_covid_vein$RBD.IgG < q1r,]$RBD.IgG <- 1
numeric_covid_vein[numeric_covid_vein$RBD.IgG >= q1r & numeric_covid_vein$RBD.IgG < q2r,]$RBD.IgG <- 2
numeric_covid_vein[numeric_covid_vein$RBD.IgG >= q2r & numeric_covid_vein$RBD.IgG < q3r,]$RBD.IgG <- 3
numeric_covid_vein[numeric_covid_vein$RBD.IgG >= q3r,]$RBD.IgG <- 4

numeric_covid_vein$COVID.19.Positive. <- factor(numeric_covid_vein$COVID.19.Positive., levels = c(0,1))



library(bestglm)
## Loading required package: leaps
test_bestglm <-numeric_covid_vein[, c(names(numeric_covid_vein)[-1], "COVID.19.Positive.")]
names(test_bestglm)[7] <- "y"

test_bestglm$Gender <- as.numeric(test_bestglm$Gender)
test_bestglm$IFNg <- as.factor(test_bestglm$IFNg)
test_bestglm$RBD.IgG <- as.factor(test_bestglm$RBD.IgG)


bestglm_model <-
    bestglm(Xy = test_bestglm,
            family = binomial,
            IC = "AIC",                 # Information criteria for
            method = "exhaustive")
## Morgan-Tatar search since family is non-gaussian.
## Note: factors present with more than 2 levels.
summary(bestglm_model$BestModel)
## 
## Call:
## glm(formula = y ~ ., family = family, data = Xi, weights = weights)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0300  -0.5967  -0.3152  -0.1678   2.7477  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)   
## (Intercept)      -0.3570     0.3896  -0.917  0.35940   
## IFNg2            -0.5693     0.5486  -1.038  0.29939   
## IFNg3            -2.1161     0.8178  -2.588  0.00966 **
## IFNg4            -2.6203     1.0835  -2.418  0.01559 * 
## Prior.COVID.19.  -1.2786     0.6040  -2.117  0.03426 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 127.52  on 146  degrees of freedom
## Residual deviance: 104.26  on 142  degrees of freedom
## AIC: 114.26
## 
## Number of Fisher Scoring iterations: 6
model_null <- glm(y ~ 1, data=test_bestglm, family=binomial(link='logit'))
model_pval <- anova(model_null, bestglm_model$BestModel, test = 'LRT')
rsqrd <- lrm(COVID.19.Positive.~ ., data=numeric_covid_vein)



odds <- odds_plot(bestglm_model$BestModel)
## Waiting for profiling to be done...
odds <- odds$odds_data
write.csv(odds, file = "/Users/lorenzocapitani/OneDrive\ -\ Cardiff\ University/Bioinformatics/MartinPaper/odds_final.csv")
odds$OR <- log(odds$OR)
odds$lower <- log(odds$lower)
odds$upper <- log(odds$upper)


ggplot(odds, aes(x = OR, y = vars)) + 
    geom_vline(aes(xintercept = 0), size = .25, linetype = "dashed") + 
    geom_errorbarh(aes(xmax = upper, xmin = lower), size = .5, height = 
                    .2, color = "gray50") +
    geom_point(size = 3.5, color = "orange") +
    theme_bw()+
    theme(panel.grid.minor = element_blank()) +
    ylab("") +
    xlab("log(Odds ratio)") +
    annotate(geom = "text", y = 0.75, x = log10(800), 
                     label = paste("Model p <", round(model_pval$`Pr(>Chi)`[2], digits = 4)), size = 3.5, hjust = 4) + 
  annotate(geom = "text", y =0.5, x = log10(1200), 
                     label = paste("Pseudo R^2 = ", round(rsqrd$stats[10][1], digits = 2)), size = 3.5, hjust = 3.53) + 
    ggtitle("Parameters and the risk of COVID infections") +
  scale_x_continuous(limits = c(-6,6))

# Odd as percentages
(exp(bestglm_model$BestModel$coefficients[-1])-1)*100
##           IFNg2           IFNg3           IFNg4 Prior.COVID.19. 
##       -43.40530       -87.95015       -92.72200       -72.15823